(1) Data Curation

Preparations

Prepare files and variables so that we can iterate through all of our code automatically

Read in Data

Read in each treatment separately and merge into one object. Normalization can be performed on individual datasets or a combined dataset; however, there are ramifications for choosing to normalize as a group (ie. merging control and treatment) or normalizing each dataset separately.

data_a_raw  <- Read10X(data.dir = "/Users/cmay/Desktop/rstudio/jenn_avm_scrna/378a/files/")
data_a      <- CreateSeuratObject(counts = data_a_raw, min.cells = 3, min.features  = 200, project = "378a", assay = "RNA")

data_b_raw  <- Read10X(data.dir = "/Users/cmay/Desktop/rstudio/jenn_avm_scrna/378b/files/") 
data_b      <- CreateSeuratObject(counts = data_b_raw, min.cells = 3, min.features  = 200, project = "378b", assay = "RNA")

data_c_raw  <- Read10X(data.dir = "/Users/cmay/Desktop/rstudio/jenn_avm_scrna/410/files/") 
data_c      <- CreateSeuratObject(counts = data_c_raw, min.cells = 3, min.features  = 200, project = "410", assay = "RNA")

data_d_raw  <- Read10X(data.dir = "/Users/cmay/Desktop/rstudio/jenn_avm_scrna/control/files/") 
data_d      <- CreateSeuratObject(counts = data_d_raw, min.cells = 3, min.features  = 200, project = "control", assay = "RNA")

avm_treatment <- merge(x = data_a, y = data_b, add.cell.ids = NULL, merge.data = TRUE)
avm_control   <- merge(x = data_c, y = data_d, add.cell.ids = NULL, merge.data = TRUE)

avm <- merge(x = data_a, y = c(data_b, data_c, data_d), add.cell.ids = NULL, merge.data = TRUE)
RenameCells(object = avm, add.cell.id = "avm")
## An object of class Seurat 
## 15244 features across 18601 samples within 1 assay 
## Active assay: RNA (15244 features)

Check the object

Confirm that your data was properly read in and slotted into the Seurat Object

# Check the metadata
head(avm@meta.data)
##                    orig.ident nCount_RNA nFeature_RNA
## AAACCTGAGATCCCGC_1       378a       6326         1594
## AAACCTGAGATGTAAC_1       378a       5990         1708
## AAACCTGAGCCCAACC_1       378a       5029         1567
## AAACCTGAGCCGCCTA_1       378a       2630          887
## AAACCTGCAATGGAAT_1       378a       5648         1828
## AAACCTGCACCGAAAG_1       378a       6100         1570
tail(avm@meta.data)
##                    orig.ident nCount_RNA nFeature_RNA
## TTTGTCAAGTTACGGG_4    control       4988         1381
## TTTGTCACACCAGGCT_4    control       3599         1148
## TTTGTCACATGTCGAT_4    control       7085         1745
## TTTGTCACATTAGGCT_4    control       3229          972
## TTTGTCACATTCACTT_4    control       3363         1160
## TTTGTCATCCCTCAGT_4    control      10615         2537

(2) Clean Data

Quality Control

High mitochondrial RNA is indicative of damaged or dead cells, thus should be filtered

#Mitochondrial RNA
mito.genes1 <- grep(pattern = "^MT", x = rownames(avm@assays[["RNA"]]), value = TRUE) #^MT for human or ^mt- for mice
percent.mito1 <- Matrix::colSums(avm@assays[["RNA"]][mito.genes1])/Matrix::colSums(avm@assays[["RNA"]])
avm <- AddMetaData(object = avm, metadata = percent.mito1, col.name = "percent.mito1") 

#Ribosomal RNA
ribo.genes1 <- grep(pattern = "^^RP[SL]", x = rownames(avm@assays[["RNA"]]), value = TRUE)
percent.ribo1 <- Matrix::colSums(avm@assays[["RNA"]][ribo.genes1])/Matrix::colSums(avm@assays[["RNA"]])
avm <- AddMetaData(object = avm, metadata = percent.ribo1, col.name = "percent.ribo1") 

VlnPlot(object = avm, group.by="orig.ident", features = c("nFeature_RNA", "nCount_RNA", "percent.mito1","percent.ribo1"), ncol=2, pt.size=0)  #Change the size of the beeswarm points

Cell Cycle Regression

Determine if you should regress out cell cycle genes from the model. If you see clusters correlating with specific cell cycle phases (S, G1, G2M), you may want to regres out cell cycle genes. Caution is advised if your dataset includes progenitor or stem cells, as proliferation may be colinear with cell type identity. First plot the data after performin normal regression (which can include counts, percent mitochondria and percent ribosomes). Next, regress out cell cycle genes (CC.Difference) and check the plot for changes. S, G1 and G2M should be more randomly scattered in the UMAP plot.

# A list of cell cycle markers, from Tirosh et al, 2015, is loaded with Seurat. 
s.genes <- cc.genes$s.genes
g2m.genes <- cc.genes$g2m.genes

#Assign Cell Cycle Scores
avm <- CellCycleScoring(avm, s.features = s.genes, g2m.features = g2m.genes, set.ident = TRUE)

#View cell cycle scores and phase assignments
head(avm[[]])
##                    orig.ident nCount_RNA nFeature_RNA percent.mito1
## AAACCTGAGATCCCGC_1       378a       6326         1594    0.09215934
## AAACCTGAGATGTAAC_1       378a       5990         1708    0.08647746
## AAACCTGAGCCCAACC_1       378a       5029         1567    0.10180950
## AAACCTGAGCCGCCTA_1       378a       2630          887    0.18441065
## AAACCTGCAATGGAAT_1       378a       5648         1828    0.11419972
## AAACCTGCACCGAAAG_1       378a       6100         1570    0.11278689
##                    percent.ribo1      S.Score  G2M.Score Phase old.ident
## AAACCTGAGATCCCGC_1     0.3188429 -0.009238666 -0.1578514    G1      378a
## AAACCTGAGATGTAAC_1     0.2681135  0.512631879 -0.4926310     S      378a
## AAACCTGAGCCCAACC_1     0.2463710 -0.003849444 -0.1554622    G1      378a
## AAACCTGAGCCGCCTA_1     0.2760456 -0.063473054 -0.3277677    G1      378a
## AAACCTGCAATGGAAT_1     0.2478754  0.006643855  0.2086011   G2M      378a
## AAACCTGCACCGAAAG_1     0.3319672 -0.152095808 -0.7533732    G1      378a
avm <- FindVariableFeatures(object = avm, mean.function = ExpMean, dispersion.function = LogVMR, x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5, nfeatures = 6000)

#Run PCA to see if cells segregate along cell cycle phase
avm <- ScaleData(object = avm, vars.to.regress = c("nCounts_RNA","percent.mito1"))  #can also add "percent.ribo1"
avm <- RunPCA(avm, features = c(s.genes, g2m.genes))

Cell Cycle Correlation

Plot to check that no clusters are correlated with cell cycle phases

DimPlot(avm)

Regress Cell Cycle Genes

To correct for Cell Cycle Genes, use ScaleData to regress the difference between G2M and S phase scores

avm$CC.Difference <- avm$S.Score - avm$G2M.Score
avm <- ScaleData(avm, vars.to.regress = "CC.Difference", features = rownames(avm))
avm <- RunPCA(avm, features = VariableFeatures(avm), nfeatures.print = 10)

Replot Cell Cycle Genes

Re-plot to confirm no clusters are correlated with cell cycle phase

DimPlot(avm,reduction = "pca", group.by="orig.ident")

Mitochondria vs Transcripts

nCount_RNA is the total number of transcripts in the cell nFeature_RNA is the number of unique genes detected in the cell

plot1 <- FeatureScatter(object = avm, feature1 = "percent.mito1", feature2 = "nCount_RNA")
plot2 <- FeatureScatter(object = avm, feature1 = "percent.ribo1", feature2 = "nCount_RNA")
plot3 <- FeatureScatter(object = avm, feature1 = "G2M.Score", feature2 = "percent.ribo1")
plot4 <- FeatureScatter(object = avm, feature1 = "S.Score", feature2 = "percent.mito1")

CombinePlots(plots = list(plot1, plot2, plot3, plot4))

Filter Cells

Filter out cells based on number of genes and percent mitochondrial genes Less than 200-500 genes indicate damaged cells. More than 5,000 genes could indicate multiplets.

avm <- subset(x = avm, subset = nFeature_RNA > 200 & nFeature_RNA < 5000  & percent.mito1 < 20)
avm <- subset(x = avm, subset = nCount_RNA > 600 & nCount_RNA < 20000)

(3) Visualize Data

Plot Candidate Genes

Dimensional reduction plot, with cells colored by a quantitative feature

plot3 <- FeaturePlot(object = avm, features = "PECAM1")
plot4 <- FeaturePlot(object = avm, features = "MMP1")

CombinePlots(plots = list(plot3, plot4))

Violin Plots

# Violin  plot
VlnPlot(object = avm, features = c("CDH5", "PECAM1","MMP1"), same.y.lims = TRUE, group.by="orig.ident", pt.size=0)

Heatmaps

DimHeatmap shows primary sources of heterogeneity in a avm and can be useful when trying to decide which PCs to include for further downstream analyses. For instance, if cell cycle genes are high here, it may indicate that you should regress out cell cycle genes.

DimHeatmap(object = avm, reduction = "pca", dims=2, cells = 100, nfeature=100, balanced = TRUE)

(4) Clustering the Cells

Seurat uses a graph-based clustering approach: K-nearest neighbor (KNN) graph, with edges drawn between cells with similar gene expression patterns, and then attempt to partition this graph into highly interconnected ‘quasi-cliques’ or ‘communities’.

Detection of Variable Genes

FindVariableGenes calculates the average expression and dispersion for each gene, places these genes into bins, and then calculates a z-score for dispersion within each bin. This helps control for the relationship between variability and average expression.

#These parameters will vary based on the data set, heterogeneity in the sample and normalization strategy.
avm <- FindVariableFeatures(object = avm, mean.function = ExpMean, dispersion.function = LogVMR, x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5, nfeatures = 5000)
head(x = HVFInfo(object = avm))
##                      mean    variance variance.standardized
## RP11-34P13.7  0.001697484 0.001694705             0.9756154
## AP006222.2    0.176902092 0.194846593             0.9594373
## RP11-206L10.9 0.004486208 0.004587609             0.9970272
## LINC00115     0.001818733 0.001815535             0.9754871
## FAM41C        0.011821764 0.011925231             0.9615060
## NOC2L         0.004849955 0.004826725             0.9690061
variable_genes <- (x = HVFInfo(object = avm))
write.csv(variable_genes, "/Users/cmay/Desktop/rstudio/avm_variable_genes.csv")

Perform Normalization

Calculate k-nearest neighbors and construct the SNN graph

## Normalize the Data
avm <- SCTransform(avm, vars.to.regress = "percent.ribo1", verbose = FALSE)
avm <- FindVariableFeatures(avm, mean.function = ExpMean, 
          dispersion.function = LogVMR, x.low.cutoff = 0.0125, 
          x.high.cutoff = 3, y.cutoff = 0.5, nfeatures = 5000) 
avm <- RunPCA(avm,npcs = 50,rev.pca = FALSE, 
          weight.by.var = TRUE,ndims.print = 1:5,
          nfeatures.print = 30,seed.use = 42,approx = TRUE) 

Find Clusters

Computing Nearest Neighbor Graph

avm <- FindNeighbors(avm,dims = 1:30, # Dimensions
           k.param = 15, # K neighbors
           nn.method = "annoy",
           annoy.metric = "hamming") # This is the distance metric

avm <- FindClusters(avm, resolution = 0.6, 
                    algorithm = 3, 
                    dims.use=1:30, 
                    n.start = 10,
                    n.iter = 100,
                    save.SNN=TRUE)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 16495
## Number of edges: 481661
## 
## Running smart local moving algorithm...
## Maximum modularity in 10 random starts: 0.8353
## Number of communities: 10
## Elapsed time: 61 seconds
save(avm, file = "/Users/cmay/Desktop/rstudio/avm_clusters.csv")

Run UMAP

avm <- RunUMAP(avm, reduction = "pca", dims = 1:30,  # can add umap.method = "umap-learn" but don't need
           metric = "hamming", n.neighbors = 20, min.dist = 0, # This allows for tighter clustering
           n.epochs = 1000, # This repeats the algorithm 1000x to get most consistent result
           local.connectivity = 3, # minimum 3 neighbors
           verbose = TRUE)
avm_umap1 <- DimPlot(avm, group.by="Phase")
avm_umap2 <- DimPlot(avm, group.by="orig.ident")
avm_umap3 <- DimPlot(avm, group.by="SCT_snn_res.0.6")

CombinePlots(plots = list(avm_umap1, avm_umap2, avm_umap3,avm_umap3))

(5) Differentially Expressed Genes

Seurat can help you find markers that define clusters via differential expression. FindAllMarkers automatically identifes positive and negative markers of a single cluster compared to all other cells. You can also test groups of clusters vs. each other, or against all cells.

Markers for clusters

# Find all markers of Cluster 1
cluster0.markers <- FindMarkers(object = avm, ident.1 = 0, min.pct = 0.25)
print(x = head(x = cluster0.markers, n = 20))
##          p_val avg_logFC pct.1 pct.2 p_val_adj
## MMP1         0 1.3645712 0.987 0.541         0
## CYTL1        0 0.9384128 0.856 0.475         0
## HLA-B        0 0.8582702 0.909 0.485         0
## ADGRF5       0 0.7019012 0.694 0.262         0
## RPL13A       0 0.6983088 1.000 0.998         0
## TRIML2       0 0.6627174 0.585 0.094         0
## NEAT1        0 0.6561820 0.989 0.879         0
## RPL3         0 0.6293213 1.000 0.995         0
## SPARC        0 0.6263405 0.995 0.940         0
## NUAK1        0 0.6226915 0.634 0.197         0
## RPS19        0 0.6193133 0.946 0.653         0
## RPL27A       0 0.6085498 0.999 0.974         0
## TPT1         0 0.5733333 1.000 1.000         0
## SULF2        0 0.5654225 0.707 0.334         0
## ADAMTS18     0 0.5614392 0.722 0.406         0
## S100A6       0 0.5585444 1.000 0.999         0
## A2M          0 0.5508741 0.362 0.039         0
## PECAM1       0 0.4566564 0.983 0.879         0
## RPS21        0 0.4513789 0.898 0.661         0
## FILIP1       0 0.4479808 0.421 0.064         0
write.csv(cluster0.markers, "/Users/cmay/Desktop/rstudio/cluster0.markers.csv")

# Find markers for every cluster compared to all remaining cells, report only the positive ones
#avm.all.markers <- FindAllMarkers(object = avm, only.pos = TRUE, min.pct = 0.25, thresh.use = 0.25, test.use="MAST")
avm.all.markers <- FindAllMarkers(object = avm, test.use="MAST")
avm.all.markers %>% group_by(cluster) %>% top_n(2, avg_logFC)
## # A tibble: 20 x 7
## # Groups:   cluster [10]
##        p_val avg_logFC pct.1 pct.2 p_val_adj cluster gene    
##        <dbl>     <dbl> <dbl> <dbl>     <dbl> <fct>   <chr>   
##  1 0.            1.36  0.987 0.541 0.        0       MMP1    
##  2 0.            0.938 0.856 0.475 0.        0       CYTL1   
##  3 0.            1.88  0.939 0.404 0.        1       TFPI2   
##  4 0.            0.802 0.734 0.356 0.        1       SPOCK1  
##  5 0.            1.12  0.973 0.568 0.        2       MMP1    
##  6 0.            0.964 0.671 0.23  0.        2       MT2A    
##  7 0.            0.878 1     0.96  0.        3       TUBA1B  
##  8 0.            0.860 1     0.992 0.        3       HMGB1   
##  9 0.            0.924 0.814 0.142 0.        4       XIST    
## 10 0.            0.581 0.815 0.491 0.        4       EMCN    
## 11 0.            1.62  0.87  0.459 0.        5       FABP4   
## 12 0.            1.36  0.763 0.206 0.        5       ALDH1A1 
## 13 6.33e-259     0.760 0.491 0.132 9.11e-255 6       IL1RL1  
## 14 9.38e- 89     0.748 0.446 0.309 1.35e- 84 6       HIST1H4C
## 15 0.            0.973 0.851 0.145 0.        7       PDPN    
## 16 0.            0.929 0.936 0.559 0.        7       MGST1   
## 17 0.            1.91  0.9   0.407 0.        8       MGP     
## 18 0.            1.59  0.882 0.469 0.        8       FABP4   
## 19 0.            2.59  0.959 0.01  0.        9       COL1A2  
## 20 0.            2.27  0.987 0.632 0.        9       FN1
write.csv(avm.all.markers, "/Users/cmay/Desktop/rstudio/avm.all.markers.csv")

Heatmap

Generates an expression heatmap for given cells and genes. Plot the top 10 markers for each cluster.

top10 <- avm.all.markers %>% group_by(cluster) %>% top_n(10, avg_logFC)
avm_heatmap <- DoHeatmap(object = avm, features = top10$gene, label = TRUE)
avm_heatmap

Violin Plot

Visualizing marker expression by Treatment Group and by Clusters VlnPlot (shows expression probability distributions across clusters)

avm_violinplot_genes1a <- VlnPlot(object = avm,features=c("PECAM1","MGP","CDH5","SPARC","MMP1","ALDH1A1"), same.y.lims = TRUE, group.by="orig.ident", pt.size=0)
avm_violinplot_genes1b <- VlnPlot(object = avm,features=c("PECAM1","MGP","CDH5","SPARC","MMP1","ALDH1A1"), same.y.lims = TRUE, pt.size=0)

avm_violinplot_genes1a

avm_violinplot_genes1b

Violin Plot 2

Visualizing marker expression by Treatment Group and by Clusters cont. VlnPlot (shows expression probability distributions across clusters)

avm_violinplot_genes2a <- VlnPlot(object = avm,features=c("TGFBI","FN1","CXCL8","GLIPR1","TFPI2","CCL2"), same.y.lims = TRUE, group.by="orig.ident", pt.size=0)
avm_violinplot_genes2b <- VlnPlot(object = avm,features=c("TGFBI","FN1","CXCL8","GLIPR1","TFPI2","CCL2"), same.y.lims = TRUE, pt.size=0)

avm_violinplot_genes2a

avm_violinplot_genes2b

UMAP plot by Genes

FeaturePlot (visualizes gene expression on a tSNE or PCA plot)

avm_featureplot_genes <- FeaturePlot(object = avm, features = c("PECAM1","TGFBI","CDH5","ALDH1A1","SPARC","MMP1"), cols = c("grey", "blue"), reduction = "umap")
avm_featureplot_genes

RidgePlot

avm_ridgeplot_genes <- RidgePlot(object = avm,features=c("PECAM1","TGFBI","CDH5","ALDH1A1","SPARC","MMP1"))
avm_ridgeplot_genes